args <- list(
  detect_summary_folder = here::here("jacusa2_detect/"),
  sample_annotation_file = here::here("metadata.txt"),
  qc_results_dir = here::here("processed_data"),
  output_folder = here::here("processed_data"),
  lengths_summary_path = here::here("genome_references/Homo_sapiens/GRCh38/annotation/ensembl93_gene_lengths_summary.csv"),
  reduced_gtf = here::here("genome_reference/Homo_sapiens/GRCh38/annotation/ensembl93_reduced_gtf.RDS")
)




plot_labels = c("Decreased_editing" = "Decreased editing",
                "Increased_editing" = "Increased editing",
                "Lost_edit_sites" = "Lost editing sites",
                "New_edit_sites" = "New editing sites",
                "astro_nt_vs_asyn" = "Astrocyte untreated vs a-syn",
                "neuron_nt_vs_asyn" = "Neuron untreated vs a-syn",
                "cocult_nt_vs_asyn" = "Coculture untreated vs a-syn",
                "astro_nt" = "Astrocyte untreated",
                "astro_asyn" = "Astrocyte a-syn",
                "cocult_nt" = "Coculture untreated",
                "cocult_asyn" = "Coculture a-syn",
                "neuron_nt" = "Neuron untreated",
                "neuron_asyn" = "Neuron a-syn",
                "exon" = "Exon",
                "intron" = "Intron",
                "intergenic" = "Intergenic",
                "not_significant" = "Not significant",
                "significant" = "Significant",
                "Astrocyte_increase" = "Astrocyte increase",
                "Astrocyte_decrease" = "Astrocyte decrease",
                "Neuron_increase" = "Neuron increase",
                "Neuron_decrease" = "Neuron decrease",
                "Coculture_increase" = "Coculture increase",
                "Coculture_decrease" = "Coculture decrease",
                "no_treatment" = "Baseline",
                "increase" = "Increase",
                "decrease" = "Decrease",
                "increased_sites" = "Increased editing rate",
                "decreased_sites" = "Decreased editing rate",
                "treated_edits" = "a-syn",
                "untreated_edits" = "Untreated",
                "exon" = "Exon",
                "intron" = "Intron",
                "intergenic" = "Intergenic",
                "increased" = "Increase",
                "decreased" = "Decreased"
                )

plot_labeller = as_labeller(plot_labels)



named_palette = c(
  increased_sites = "deeppink3",
  New = "deeppink3",
  increased = "deeppink3",
  new_sites = "hotpink1",
  decreased_sites = "#2a75b2",
  Lost = "#2a75b2",
  decreased = "#2a75b2",
  lost_sites = "lightskyblue",
  astro_nt = "#7FDEEA",
  astro_asyn = "#0097A6",
  neuron_nt = "#FF91C8",
  neuron_asyn = "deeppink3",
  cocult_nt = "#9FA7D8",
  cocult_asyn = "#303F9F",
  Control = "#1d9064",
  no_treatment = "#1d9064",
  DLB = "#ce490a",
  PD  = "#625aa4",
  `1uM_syn_oligomer`= "#625aa4",
  PDD = "#de0077",
    exon = "#fc6a0f",
  intron = "#9db8e0",
  intergenic = "#1a62a5"
)

0.1 Background data re samples

Sample annotation and information

# Load gene lengths
gene_lengths <- read_delim(args$lengths_summary_path)

editing_metrics <- c("Edits", "edit_adjusted")
outcome_to_explore = "Disease_Group"
covariates_used = c("RIN", "Sex")
# Load annotation file
metadata <- read_delim(args$sample_annotation_file)
metadata <- metadata %>% 
   dplyr::select(Sample = sample_name, Disease_Group, Sex, AoO, AoD, DD, PMI, RIN = RINe_bulkRNA_Tapestation, aSN, TAU, AB = `thal AB`, aCG_aSN_score = `aCG aSN score`) %>% 
  dplyr::mutate(sample_id = Sample)

Samples per disease group

RIN_bar <- metadata %>% 
  ggplot(aes(x=Sample, y = RIN, fill = Disease_Group)) +
  geom_bar(stat = "identity") +
  theme_aw +
    scale_fill_manual(values = named_palette) + 
  labs(title = "Samples RIN")
  

AOD_bar <- metadata %>% 
  ggplot(aes(x=Sample, y = AoD, fill = Disease_Group)) +
  geom_bar(stat = "identity") +
  theme_aw +
      scale_fill_manual(values = named_palette) + 
  labs(title = "Age of death")

PMI_bar <- metadata %>% 
  ggplot(aes(x=Sample, y = PMI, fill = Disease_Group)) +
  geom_bar(stat = "identity") +
  theme_aw +
      scale_fill_manual(values = named_palette) + 
  labs(title = "Post mortem interval")

sex_bar <- metadata %>% 
  ggplot(aes(x=Disease_Group, fill = Sex)) +
  geom_bar(stat = "count",
           position = "stack") +
  theme_aw +
      scale_fill_paletteer_d("werpals::pan",
                        direction = -1) +
  labs(title = "Samples: sex")



(RIN_bar | AOD_bar | PMI_bar | sex_bar | plot_layout(guides = "collect", widths = c(2,2,2,1)))

# Load file of significant results
# master_df <- readRDS(args$detect_summary_comparison_sig_file)
# master_df$Disease_Group <- factor(master_df$Disease_Group)

1 Load edit site files

edit_sites_files <- list.files(args$detect_summary_folder,
                               pattern = ".site_edits.csv",
                              full.names = T)


load_edit_sites <- function(file_path, metadata_df) {
  sites <- read_delim(file_path)
  sites <- sites %>% 
    dplyr::filter(z.score <= -1.96 | z.score >= 1.96) %>% 
    left_join(metadata,
              by = c("Sample" = "sample_id"))
  return(sites)
}
contrasts = list(PD = c("Control", "PD"),
                 DLB = c("Control", "DLB"),
                 PDD = c("Control", "PDD"))
region_to_explore <- NULL

2 Results

2.1 Non fully edited sites

for (contrast_number in 1:length(contrasts)) {
    
    contrast_to_explore <- contrasts[[contrast_number]]
    # Generate Markdown header

if (is.null(region_to_explore)){
  cat("##", contrast_to_explore[1], " vs. ", contrast_to_explore[[2]], "\n\n")
} else if (!is.null(region_to_explore)){
  cat("##", region_to_explore, ":",  contrast_to_explore[1], " vs. ", contrast_to_explore[[2]], "\n\n")
} 
 
   # Analysis and plotting code for the current region and contrast
    contrast_list <- list(Control = contrast_to_explore[1],
                          Disease = contrast_to_explore[2])
    disease = contrast_to_explore[2]
    
    print(str_c("Exploring ", region_to_explore, ": ", contrast_to_explore[1], " vs. ", contrast_to_explore[[2]]))
    
    
    comparison_df <- filter_and_load_editing_results(metadata_df = metadata,
                                                     edit_files_list = edit_sites_files,
                                                     contrast_to_explore = contrast_to_explore,
                                                     region_to_explore = region_to_explore
    )
    
    # Replace NAs in genic type with intergenic, and create column of collapsed biotypes for plotting
    comparison_df <- comparison_df %>%
      dplyr::mutate(biotype_plots = as_factor(gene_biotype),
                    gene_type = as_factor(gene_type) %>% fct_relevel(filtering_args$gtf_genic_type)) %>% 
      remove_suffix_from_gene_id(., column = "gene_id")
    
    # Recode and reorder biotype levels for plotting
    levels(comparison_df$biotype_plots) <- vep_levels_BIOTYPE

    # Factorise locus
    comparison_df$locus <- factor(comparison_df$locus)
    
    # Factorise disease group, checking levels in data match levels in factor to be
    comparison_df$Disease_Group <- factor(comparison_df$Disease_Group)
    dataframe_levels <- levels(comparison_df$Disease_Group)
    if (!all(dataframe_levels %in% contrast_to_explore) && all(contrast_to_explore %in% dataframe_levels)) {
      errorCondition("Supplied sorting factor does not match levels in data")
    }
    levels(comparison_df$Disease_Group) <- contrast_to_explore
    
    # Find sample counts and consequences for annotation later
    sample_counts <- comparison_df %>%
      distinct(Sample, Disease_Group) %>%
      count(Disease_Group)
    
    loci_consequences <- comparison_df %>%
      dplyr::select(locus, gene_name, biotype_plots, gene_type, repeat_class) %>%
      dplyr::distinct(locus, .keep_all = T)
    
    # Filter by number of sites common within groups and between samples
    sites_to_explore <- copy(comparison_df)
    number_common_samples_per_group = 2
    site_label <- "common_2_sample"
    site_annotation <- "Sites common to 2 samples per group: "
    
    common_loci <- find_sites_loci_common_to_x_samples(sites_df = sites_to_explore,
                                                       sample_counts_df = sample_counts,
                                                       no_common_samples_per_site = number_common_samples_per_group )
    
    sites_subset <- filter_sites_by_locus_vector(sites_df = sites_to_explore,
                                                 vector_of_loci = common_loci,
                                                 factor_list = contrast_list)
    

      # Baseline plots: rate and site number
    # edit_mean_rate_box <- create_box_plot_editing_metric_per_disease_group(edit_summary_df = sites_subset,
    #                                                                        plotting_variable = "Edits") +
    #   scale_y_continuous(trans = "identity") +
    #   labs(subtitle = "Editing rate per sample")
    
    edit_mean_rate_histogram <- plot_residual_histogram(regression_df = sites_subset,
                                                        resid_column = "Edits",
                                                        fill_variable = "Disease_Group") +
      labs(x = "Raw editing rate")
    
    
    total_sites <- plot_site_number_boxplot(sites_subset)
    
    number_sites_per_sample <- sites_subset %>% 
      count(Sample) %>% 
      dplyr::rename(n_sites_per_sample = n)
    
    sites_subset <- sites_subset %>% 
      left_join(number_sites_per_sample,
                by = "Sample")
    
    
    ###  Regression by sites #### 
    lm_results_sites <- regress_editing(sites_df = sites_subset,
                                        editing_metric = "Edits",
                                        covariates = covariates_used,
                                        factor_list = contrast_list,
                                        outcome_measure = outcome_to_explore,
                                        disease = disease)
    

    site_residual_histogram <- plot_residual_histogram(regression_df = lm_results_sites$sites_subset_residuals,
                                                                resid_column = '.resid',
                                                                fill_variable = 'Disease_Group') +
      labs(subtitle = "Site residuals, no outcome")
    
    
    mean_residual_histogram <-  plot_residual_histogram(regression_df = lm_results_sites$mean_residuals_df %>% 
                                                          dplyr::mutate(Disease_Group = fct_recode(Disease_Group, !!disease := "Disease")),
                                                                resid_column = 'mean_residuals_per_feature',
                                                                fill_variable = 'Disease_Group') +
      labs(subtitle = "Mean residuals per feature, no outcome")
    
    
    sites_residual_histogram_with_outcome <-  plot_residual_histogram(regression_df = lm_results_sites$sites_subset_residuals_with_outcome,
                                                                resid_column = '.resid',
                                                                fill_variable = 'Disease_Group') +
      labs(subtitle = "Site residuals, with outcome")
    
    
    
    forest_plot_sites <- create_forest_plot_from_lm(linear_model = lm_results_sites$lm_results_with_outcome,
                                                    disease = disease) +
      labs(title = "Regression of sites")
    
    
    ## Compare baseline to change  for plotting
    compare_baseline_df <- compare_proportions_baseline_altered(lm_results_list = lm_results_sites,
                                                                loci_consequences = loci_consequences,
                                                                factor_list = contrast_list)
    genic_dif_plot <- plot_compare_proportions_baseline(compare_proportions_df = compare_baseline_df,
                                                        metric_to_plot = "gene_type",
                                                        fill_label = "Genic Location")
    biotype_dif_plot <- plot_compare_proportions_baseline(compare_proportions_df = compare_baseline_df,
                                                          metric_to_plot = "biotype_plots",
                                                          fill_label = "Biotype")
    
    ### Create gene editing summary metric
    gene_metric_df <- sites_subset %>%
      get_editing_summary_metric(summary_df = .,
                                 gene_lengths_df = gene_lengths,
                                 outcome_measure = outcome_to_explore,
                                 covariates = covariates_used,
                                 factor_list = contrast_list) %>% 
      left_join(.,
                number_sites_per_sample,
                by = "Sample")
    
    # Plot number of sites per gene
    no_sites_boxplot <- create_box_plot_editing_metric_per_disease_group(edit_summary_df = gene_metric_df,
                                                                         plotting_variable = "no_sites") +
      scale_y_continuous(trans = "identity") +
      labs(subtitle = "No sites per gene", x = NULL)
    
    # Define metrics and labels in a named list
    editing_metrics <- list(
      mean_adjusted = "Mean edits per gene",
      no_sites_adjusted = "No. of sites per gene",
      mean_over_exon_length_adjusted = "Mean over exon length",
      mean_x_no_sites_adjusted = "Mean * No. of sites",
      complete_mean_adjusted = "Complete metric / exon length",
      complete_mean_genelength_adjusted = "Complete metric / gene length"
    )
    
    # Use lapply to iterate over each metric and return a list of plots of various parts of the gene metric
    forest_plots <- lapply(names(editing_metrics), function(edit_variable_to_plot) {
      
      edit_variable_label <- editing_metrics[[edit_variable_to_plot]]
      
      lm_results <- regress_editing(
        sites_df = gene_metric_df,
        editing_metric = edit_variable_to_plot,
        covariates = covariates_used,
        factor_list = contrast_list,
        outcome_measure = outcome_to_explore,
        disease = disease
      )
      
      forest_plot <- create_forest_plot_from_lm(linear_model = lm_results$lm_results_with_outcome,
                                                disease = disease) +
        labs(title = edit_variable_label)
      
      return(forest_plot)
    })
    
    # Name the list elements according to the editing metric names
    names(forest_plots) <- names(editing_metrics)
    title_forest <- title_plot("Regressions")
    
    all_forest_plots <- c(list(title_forest, forest_plot_sites = forest_plot_sites), forest_plots[names(editing_metrics)])
    
    
    ### Run regression with covariates and metrics all scaled, and n_site_per_gene log transformed
    sites_subset_scaled <- sites_subset %>% 
      dplyr::mutate(RIN = scale(RIN))
    
    lm_results_sites_scaled <- regress_editing(sites_df = sites_subset_scaled,
                                               editing_metric = "Edits",
                                               covariates = c("n_sites_per_sample", covariates_used),
                                               factor_list = contrast_list,
                                               outcome_measure = outcome_to_explore,
                                               disease = disease)
    
    forest_plot_sites_scaled <- create_forest_plot_from_lm(linear_model = lm_results_sites_scaled$lm_results_with_outcome,
                                                           disease = disease) +
      labs(title = "Regression of sites - scaled")
    
    gene_metric_df_scaled <- sites_subset_scaled %>%
      get_editing_summary_metric_scaled(summary_df = .,
                                 gene_lengths_df = gene_lengths,
                                 outcome_measure = outcome_to_explore,
                                 covariates = covariates_used,
                                 factor_list = contrast_list) %>% 
      left_join(.,
                number_sites_per_sample,
                by = "Sample")
    
    
    # Use lapply to iterate over each metric and return a list of plots of various parts of the gene metric
    forest_plots_scaled <- lapply(names(editing_metrics), function(edit_variable_to_plot) {
      
      edit_variable_label <- editing_metrics[[edit_variable_to_plot]]
      
      lm_results_scaled <- regress_editing(
        sites_df = gene_metric_df_scaled,
        editing_metric = edit_variable_to_plot,
        covariates = c("n_sites_per_sample", covariates_used),
        factor_list = contrast_list,
        outcome_measure = outcome_to_explore,
        disease = disease
      )
      
      forest_plot_scaled <- create_forest_plot_from_lm(linear_model = lm_results_scaled$lm_results_with_outcome,
                                                       disease = disease) +
        labs(title = edit_variable_label)
      
      return(forest_plot_scaled)
    })
    
    # Name the list elements according to the editing metric names
    names(forest_plots_scaled) <- names(editing_metrics)
    title_scaled <- title_plot("Scale site covariates \n and log10 nsites per gene")
    
    all_forest_plots_scaled <- c(list(title_scaled, forest_plot_sites = forest_plot_sites_scaled), forest_plots_scaled[names(editing_metrics)])
    
    
    
    
    
    ### Run regression covaried by n sites per sample
    # Regression by number of sites + n_sites_per_sample
    lm_results_sites_nsites <- regress_editing(sites_df = sites_subset,
                                               editing_metric = "Edits",
                                               covariates = c("n_sites_per_sample", covariates_used),
                                               factor_list = contrast_list,
                                               outcome_measure = outcome_to_explore,
                                               disease = disease)
    
    forest_plot_sites_nsites <- create_forest_plot_from_lm(linear_model = lm_results_sites_nsites$lm_results_with_outcome,
                                                           disease = disease) +
      labs(title = "Regression of sites w site_no")
    
    
     gene_metric_df <- sites_subset %>%
      get_editing_summary_metric(summary_df = .,
                                 gene_lengths_df = gene_lengths,
                                 outcome_measure = outcome_to_explore,
                                 covariates = covariates_used,
                                 factor_list = contrast_list) %>% 
      left_join(.,
                number_sites_per_sample,
                by = "Sample")
    
    # Use lapply to iterate over each metric and return a list of plots of various parts of the gene metric
    forest_plots_nsites <- lapply(names(editing_metrics), function(edit_variable_to_plot) {
      
      edit_variable_label <- editing_metrics[[edit_variable_to_plot]]
      
      lm_results_nsites <- regress_editing(
        sites_df = gene_metric_df,
        editing_metric = edit_variable_to_plot,
        covariates = c("n_sites_per_sample", covariates_used),
        factor_list = contrast_list,
        outcome_measure = outcome_to_explore,
        disease = disease
      )
      
      forest_plot_nsites <- create_forest_plot_from_lm(linear_model = lm_results_nsites$lm_results_with_outcome,
                                                       disease = disease) +
        labs(title = edit_variable_label)
      
      return(forest_plot_nsites)
    })
    
    # Name the list elements according to the editing metric names
    names(forest_plots_nsites) <- names(editing_metrics)
    title_nsites <- title_plot("Regressions covaried \n for nsites per sample")
    
    all_forest_plots_nsites <- c(list(title_nsites, forest_plot_sites = forest_plot_sites_nsites), forest_plots_nsites[names(editing_metrics)])
    
    
    
    
    
    ### Create summary plot
    cat("\n")
    print(summary_plot <- (edit_mean_rate_histogram | total_sites | no_sites_boxplot |sites_residual_histogram_with_outcome | site_residual_histogram | mean_residual_histogram |  genic_dif_plot ) /
            (Reduce(`|`, all_forest_plots)) +
            (Reduce(`|`, all_forest_plots_scaled)) + 
            (Reduce(`|`, all_forest_plots_nsites)) +
            plot_layout(guides = "collect") +
            plot_annotation(title = paste0(region_to_explore, " : ", disease, " verse control")))
    
    if (is.null(region_to_explore)){
      ggsave(summary_plot,
             file = here::here("figures/update_figures/",
                               str_c(disease, ".png")),
             device = "png",
             units = "cm",
             height = 44,
             width = 60)  
    } else if (!is.null(region_to_explore)){
      ggsave(summary_plot,
             file = here::here("figures/update_figures/",
                               str_c(region_to_explore, "_", disease, ".png")),
             device = "png",
             units = "cm",
             height = 44,
             width = 60)  
    } 
    

    
    
    cat("\n\n")

}

2.2 Control vs. PD

[1] “Exploring : Control vs. PD” [1] “Exploring: Control vs. PD” [1] “Control group; 5 samples” [1] “PD group; 7 samples” [1] “Grouping gene summary by:” [1] “Sample” “gene_id” “Disease_Group” “RIN”
[5] “Sex”
[1] “48 out of 12099 genes do not have a gene length.” [1] “Grouping gene summary by:” [1] “Sample” “gene_id” “Disease_Group” “RIN”
[5] “Sex”
[1] “48 out of 12099 genes do not have a gene length.” [1] “Grouping gene summary by:” [1] “Sample” “gene_id” “Disease_Group” “RIN”
[5] “Sex”
[1] “48 out of 12099 genes do not have a gene length.”

2.3 Control vs. DLB

[1] “Exploring : Control vs. DLB” [1] “Exploring: Control vs. DLB” [1] “Control group; 5 samples” [1] “DLB group; 6 samples” [1] “Grouping gene summary by:” [1] “Sample” “gene_id” “Disease_Group” “RIN”
[5] “Sex”
[1] “36 out of 9812 genes do not have a gene length.” [1] “Grouping gene summary by:” [1] “Sample” “gene_id” “Disease_Group” “RIN”
[5] “Sex”
[1] “36 out of 9812 genes do not have a gene length.” [1] “Grouping gene summary by:” [1] “Sample” “gene_id” “Disease_Group” “RIN”
[5] “Sex”
[1] “36 out of 9812 genes do not have a gene length.”

2.4 Control vs. PDD

[1] “Exploring : Control vs. PDD” [1] “Exploring: Control vs. PDD” [1] “Control group; 5 samples” [1] “PDD group; 6 samples” [1] “Grouping gene summary by:” [1] “Sample” “gene_id” “Disease_Group” “RIN”
[5] “Sex”
[1] “52 out of 11924 genes do not have a gene length.” [1] “Grouping gene summary by:” [1] “Sample” “gene_id” “Disease_Group” “RIN”
[5] “Sex”
[1] “52 out of 11924 genes do not have a gene length.” [1] “Grouping gene summary by:” [1] “Sample” “gene_id” “Disease_Group” “RIN”
[5] “Sex”
[1] “52 out of 11924 genes do not have a gene length.”

2.5 Fully edited sites

for (contrast_number in 1:length(contrasts)) {
  
  contrast_number =1
  quantile_cuttoff <- 0.9
  
  contrast_to_explore <- contrasts[[contrast_number]]
  # Generate Markdown header
  
  if (is.null(region_to_explore)){
    cat("##", contrast_to_explore[1], " vs. ", contrast_to_explore[[2]], "\n\n")
  } else if (!is.null(region_to_explore)){
    cat("##", region_to_explore, ":",  contrast_to_explore[1], " vs. ", contrast_to_explore[[2]], "\n\n")
  } 
  
  # Analysis and plotting code for the current region and contrast
  contrast_list <- list(Control = contrast_to_explore[1],
                        Disease = contrast_to_explore[2])
  disease = contrast_to_explore[2]
  
  print(str_c("Exploring ", region_to_explore, ": ", contrast_to_explore[1], " vs. ", contrast_to_explore[[2]]))
  
  
  comparison_df <- filter_and_load_editing_results(metadata_df = metadata,
                                                   edit_files_list = edit_sites_files,
                                                   contrast_to_explore = contrast_to_explore,
                                                   region_to_explore = region_to_explore
  )
  
  # Replace NAs in genic type with intergenic, and create column of collapsed biotypes for plotting
  comparison_df <- comparison_df %>%
    dplyr::mutate(biotype_plots = as_factor(gene_biotype),
                  gene_type = as_factor(gene_type) %>% fct_relevel(filtering_args$gtf_genic_type)) %>% 
    remove_suffix_from_gene_id(., column = "gene_id")
  
  # Recode and reorder biotype levels for plotting
  levels(comparison_df$biotype_plots) <- vep_levels_BIOTYPE
  
  # Factorise locus
  comparison_df$locus <- factor(comparison_df$locus)
  
  # Factorise disease group, checking levels in data match levels in factor to be
  comparison_df$Disease_Group <- factor(comparison_df$Disease_Group)
  dataframe_levels <- levels(comparison_df$Disease_Group)
  if (!all(dataframe_levels %in% contrast_to_explore) && all(contrast_to_explore %in% dataframe_levels)) {
    errorCondition("Supplied sorting factor does not match levels in data")
  }
  levels(comparison_df$Disease_Group) <- contrast_to_explore
  
  
  outliers <- comparison_df %>% 
    dplyr::filter(Edits == 1)
  
  
  outlier_bar <- outliers %>% 
    ggplot(aes(x = Disease_Group, fill = Disease_Group)) +
    geom_bar(stat = "count") +
    scale_fill_manual(values = named_palette) +
    theme_aw +
    labs(title = "100% edited sites")
  
  outlier_genic_type <- outliers %>% 
    ggplot(aes(x = Disease_Group, fill = gene_type)) +
    geom_bar(stat = "count", position = "fill") +
    scale_fill_manual(values = named_palette) +
    theme_aw +
    labs(title = "100% edited sites", subtitle = "Genic location")
  
  
  outlier_biotype <- outliers %>% 
    ggplot(aes(x = Disease_Group, fill = biotype_plots)) +
    geom_bar(stat = "count", position = "fill") +
    scale_fill_paletteer_d("ggthemes::Classic_20",
                           na.value = "grey90",
                           name = "Biotype",
                           labels = plot_labels) +
    theme_aw +
    labs(title = "100% edited sites", subtitle = "Biotype")
  
  
  
  
  
  heatmap_data <- comparison_df %>%
    group_by(gene_type, repeat_class) %>%
    summarize(count = n()) %>% 
    mutate(gene_type = factor(gene_type, levels = filtering_args$gtf_genic_type),
           repeat_class = factor(repeat_class, levels = filtering_args$alu_repeat_levels))
  
  
  
  complete_correlation_plot <- ggplot(heatmap_data, aes(x = repeat_class, y = fct_rev(gene_type), fill = log10(count))) +
    geom_tile() +
    scale_fill_viridis() +  # Adjust color scale as needed
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) +
    labs(title = "Count of overlapping genic locations and repeat motifs", 
         subtitle = "All editing sites",
         x = "Repeat Class",
         y = "Gene Type")
  
  
  outlier_heatmap_data <- outliers %>%
    group_by(gene_type, repeat_class) %>%
    summarize(count = n()) %>% 
    mutate(gene_type = factor(gene_type, levels = filtering_args$gtf_genic_type),
           repeat_class = factor(repeat_class, levels = filtering_args$alu_repeat_levels))
  
  
  outlier_correlation_plot <- ggplot(outlier_heatmap_data, aes(x = repeat_class, y = fct_rev(gene_type), fill = log10(count))) +
    geom_tile() +
    scale_fill_viridis() +  # Adjust color scale as needed
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) +
    labs(title = "Count of overlapping genic locations and repeat motifs", 
         subtitle = "100% Edited sites",
         x = "Repeat Class",
         y = "Gene Type") 
  
  
       ### Create summary plot
    cat("\n")
    
    print(summary_plot <- (complete_correlation_plot) / (outlier_correlation_plot ) /
            (outlier_bar | outlier_genic_type | outlier_biotype)) +
      plot_annotation(title = paste0(region_to_explore, " : ", disease, " verse control"))
          

    if (is.null(region_to_explore)){
      ggsave(summary_plot,
             file = here::here("figures/update_figures/",
                               str_c(disease, "_fully_edited_sites.png")),
             device = "png",
             units = "cm",
             height = 44,
             width = 60)  
    } else if (!is.null(region_to_explore)){
      ggsave(summary_plot,
             file = here::here("figures/update_figures/",
                               str_c(region_to_explore, "_", disease, "_fully_edited_sites.png")),
             device = "png",
             units = "cm",
             height = 44,
             width = 60)  
    } 
    

    
    
    cat("\n\n")

  
   
}

2.6 Control vs. PD

[1] “Exploring : Control vs. PD” [1] “Exploring: Control vs. PD”

2.7 Control vs. PD

[1] “Exploring : Control vs. PD” [1] “Exploring: Control vs. PD”

2.8 Control vs. PD

[1] “Exploring : Control vs. PD” [1] “Exploring: Control vs. PD”

2.9 New/lost sites

for (contrast_number in 1:length(contrasts)) {
  
  contrast_to_explore <- contrasts[[contrast_number]]
  # Generate Markdown header
  
  if (is.null(region_to_explore)){
    cat("##", contrast_to_explore[1], " vs. ", contrast_to_explore[[2]], "\n\n")
  } else if (!is.null(region_to_explore)){
    cat("##", region_to_explore, ":",  contrast_to_explore[1], " vs. ", contrast_to_explore[[2]], "\n\n")
  } 
  
  # Analysis and plotting code for the current region and contrast
  contrast_list <- list(Control = contrast_to_explore[1],
                        Disease = contrast_to_explore[2])
  disease = contrast_to_explore[2]
  
  print(str_c("Exploring ", region_to_explore, ": ", contrast_to_explore[1], " vs. ", contrast_to_explore[[2]]))
  
  
  comparison_df <- filter_and_load_editing_results(metadata_df = metadata,
                                                   edit_files_list = edit_sites_files,
                                                   contrast_to_explore = contrast_to_explore,
                                                   region_to_explore = region_to_explore
  )
  
  # Replace NAs in genic type with intergenic, and create column of collapsed biotypes for plotting
  comparison_df <- comparison_df %>%
    dplyr::mutate(biotype_plots = as_factor(gene_biotype),
                  gene_type = as_factor(gene_type) %>% fct_relevel(filtering_args$gtf_genic_type)) %>% 
    remove_suffix_from_gene_id(., column = "gene_id")
  
  # Recode and reorder biotype levels for plotting
  levels(comparison_df$biotype_plots) <- vep_levels_BIOTYPE
  
  # Factorise locus
  comparison_df$locus <- factor(comparison_df$locus)
  
  # Factorise disease group, checking levels in data match levels in factor to be
  comparison_df$Disease_Group <- factor(comparison_df$Disease_Group)
  dataframe_levels <- levels(comparison_df$Disease_Group)
  if (!all(dataframe_levels %in% contrast_to_explore) && all(contrast_to_explore %in% dataframe_levels)) {
    errorCondition("Supplied sorting factor does not match levels in data")
  }
  levels(comparison_df$Disease_Group) <- contrast_to_explore
  
  # Find sample counts and consequences for annotation later
  sample_counts <- comparison_df %>%
    distinct(Sample, Disease_Group) %>%
    count(Disease_Group)
  
  loci_consequences <- comparison_df %>%
    dplyr::select(locus, gene_name, biotype_plots, gene_type, repeat_class) %>%
    dplyr::distinct(locus, .keep_all = T)
  
  # Filter by number of sites common within groups and between samples
  sites_to_explore <- copy(comparison_df)
  number_common_samples_per_group = 2
  site_label <- "common_2_sample"
  site_annotation <- "Sites common to 2 samples per group: "
  
  
  new_lost_sites <- find_new_lost_sites(sites_df = sites_to_explore,
                                        sample_counts_df = sample_counts,
                                        no_common_samples_per_site = number_common_samples_per_group ) %>% 
    left_join(., loci_consequences,
              by = "locus")
  
  
  newlost_bar <- new_lost_sites %>% 
    ggplot(aes(x = new_lost, fill = new_lost)) +
    geom_bar(stat = "count") +
    scale_fill_manual(values = named_palette) +
    theme_aw +
    labs(title = "100% edited sites")
  
  newlost_genic_type <- new_lost_sites %>% 
    ggplot(aes(x = new_lost, fill = gene_type)) +
    geom_bar(stat = "count", position = "fill") +
    scale_fill_manual(values = named_palette) +
    theme_aw +
    labs(title = "100% edited sites", subtitle = "Genic location")
  
  
  newlost_biotype <- new_lost_sites %>% 
    ggplot(aes(x = new_lost, fill = biotype_plots)) +
    geom_bar(stat = "count", position = "fill") +
    scale_fill_paletteer_d("ggthemes::Classic_20",
                           na.value = "grey90",
                           name = "Biotype",
                           labels = plot_labels) +
    theme_aw +
    labs(title = "100% edited sites", subtitle = "Biotype")
  
  
  
  
  
  heatmap_data <- comparison_df %>%
    group_by(gene_type, repeat_class) %>%
    summarize(count = n()) %>% 
    mutate(gene_type = factor(gene_type, levels = filtering_args$gtf_genic_type),
           repeat_class = factor(repeat_class, levels = filtering_args$alu_repeat_levels))
  
  
  
  complete_correlation_plot <- ggplot(heatmap_data, aes(x = repeat_class, y = fct_rev(gene_type), fill = log10(count))) +
    geom_tile() +
    scale_fill_viridis() +  # Adjust color scale as needed
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) +
    labs(title = "Count of overlapping genic locations and repeat motifs", 
         subtitle = "All editing sites",
         x = "Repeat Class",
         y = "Gene Type")
  
  
  newlost_heatmap_data <- new_lost_sites %>%
    group_by(gene_type, repeat_class) %>%
    summarize(count = n()) %>% 
    mutate(gene_type = factor(gene_type, levels = filtering_args$gtf_genic_type),
           repeat_class = factor(repeat_class, levels = filtering_args$alu_repeat_levels))
  
  
  newlost_correlation_plot <- ggplot(newlost_heatmap_data, aes(x = repeat_class, y = fct_rev(gene_type), fill = log10(count))) +
    geom_tile() +
    scale_fill_viridis() +  # Adjust color scale as needed
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) +
    labs(title = "Count of overlapping genic locations and repeat motifs", 
         subtitle = "100% Edited sites",
         x = "Repeat Class",
         y = "Gene Type") 
  
  
   
    
       ### Create summary plot
    cat("\n")
    
    print(summary_plot <- (complete_correlation_plot) / (newlost_correlation_plot ) /
    (newlost_bar | newlost_genic_type | newlost_biotype)) +
      plot_annotation(title = paste0(region_to_explore, " : ", disease, " verse control"))
          

    if (is.null(region_to_explore)){
      ggsave(summary_plot,
             file = here::here("figures/update_figures/",
                               str_c(disease, "_newlost_sites.png")),
             device = "png",
             units = "cm",
             height = 44,
             width = 60)  
    } else if (!is.null(region_to_explore)){
      ggsave(summary_plot,
             file = here::here("figures/update_figures/",
                               str_c(region_to_explore, "_", disease, "_newlost_sites.png")),
             device = "png",
             units = "cm",
             height = 44,
             width = 60)  
    } 
    

    
    
    cat("\n\n")

  
  
  
}

2.10 Control vs. PD

[1] “Exploring : Control vs. PD” [1] “Exploring: Control vs. PD” [1] “Control group; 5 samples” [1] “PD group; 7 samples”

2.11 Control vs. DLB

[1] “Exploring : Control vs. DLB” [1] “Exploring: Control vs. DLB” [1] “Control group; 5 samples” [1] “DLB group; 6 samples”

2.12 Control vs. PDD

[1] “Exploring : Control vs. PDD” [1] “Exploring: Control vs. PDD” [1] “Control group; 5 samples” [1] “PDD group; 6 samples”

)